#######################################################################################
# Replication materials for 
#
# Richard Traunmüller, Andreas Murr &  Jeff Gill 
# "Modeling Latent Information in Voting Data with Dirichlet Process Priors"
# Political Analysis
#
# Version:  October 7th 2014
#
# Requires: GlesWuest.dta (dataset)
#           RecodeGles2009LikeWuest2011.do (create replication data set, optional)
#           JeffGlmdmCode.R (glmdm model code)
#
#######################################################################################

# load packages

  source("JeffGlmdmCode.R")
	library(foreign)
	library(Hmisc)
	library(ROCR)

# load data

	data <- read.dta("GlesWuest.dta", convert.factors=F)

# OR: replicate this data set by downloading the original dataset ("ZA5302_en_v6-0-0.dta") from 
# the GESIS website: www.gesis.org/wahlen/gles/ 
# and recoding it with "RecodeGles2009LikeWuest2011.do"
# this do file saves the data set as "GlesWuestReplication.dta" 
# which you can load in R with
# data <- read.dta("GlesWuestReplication.dta", convert.factors=F)

# generate new variables

	data$cdu <- ifelse(data$secondvote==1, 1, 0)
	data$left <- ifelse(data$secondvote==2 | data$secondvote==4 | data$secondvote==5, 1, 0)
	data$partisanship <- ifelse(data$pid==1, 1, 0)

# Standardize (quasi-)metric variables

	data$age <- (data$age - mean(data$age, na.rm = TRUE)) / (2*sd(data$age, na.rm = TRUE))
	data$class <- (data$class - mean(data$class, na.rm = TRUE)) / (2*sd(data$class, na.rm = TRUE))
	data$interest <- (data$interest - mean(data$interest, na.rm = TRUE)) / (2*sd(data$interest, na.rm = TRUE))
	data$strengthpid <- (data$strengthpid - mean(data$strengthpid, na.rm = TRUE)) / (2*sd(data$strengthpid, na.rm = TRUE))
	data$civicduty <- (data$civicduty - mean(data$civicduty, na.rm = TRUE)) / (2*sd(data$civicduty, na.rm = TRUE))
	data$absdiffeval <- (data$absdiffeval - mean(data$absdiffeval, na.rm = TRUE)) / (2*sd(data$absdiffeval, na.rm = TRUE))
	data$satisf <- (data$satisf - mean(data$satisf, na.rm = TRUE)) / (2*sd(data$satisf, na.rm = TRUE))
	data$attendance <- (data$attendance - mean(data$attendance, na.rm = TRUE)) / (2*sd(data$attendance, na.rm = TRUE))
	data$leftright <- (data$leftright - mean(data$leftright, na.rm = TRUE)) / (2*sd(data$leftright, na.rm = TRUE))
	data$spending <- (data$spending - mean(data$spending, na.rm = TRUE)) / (2*sd(data$spending, na.rm = TRUE))
	data$immigration <- (data$immigration - mean(data$immigration, na.rm = TRUE)) / (2*sd(data$immigration, na.rm = TRUE))
	data$nuclear <- (data$nuclear - mean(data$nuclear, na.rm = TRUE)) / (2*sd(data$nuclear, na.rm = TRUE))

# select post-election and people with migration background

	dataTur <- na.omit(subset(data[data$survey==2 & data$migrbckgr==1,], select = c("turnout", "firstgeneration", "aussiedler", "guestworker", "otherlanguage", "other", "female", "age", "educ", "employ", "member", "class", "interest", "strengthpid", "civicduty", "absdiffeval", "satisf")))
	
	dataPid <- na.omit(subset(data[data$survey==2 & data$migrbckgr==1,], select = c("cdu", "firstgeneration", "aussiedler", "guestworker", "otherlanguage", "other", "female", "age", "educ", "employ", "member", "attendance", "worker", "selfempl", "leftright", "spending", "immigration", "nuclear", "partisanship")))

# write functions for cross-validation

	mean.beta <- function(m){
		apply(m$beta.sims, 2, mean)
	}

	Split4 <- function(DV){
		# divide the dataset into 4 groups
		totalcases <- dim(DV)[1]
		group1size <- trunc(totalcases / 4)
		group2size <- trunc(totalcases / 4)
		group3size <- trunc(totalcases / 4)
		group4size <- totalcases - group1size - group2size - group3size
		# create a vector to sample from
		allcases <- 1:totalcases 
		s1 <- sample(allcases, group1size)
		remainder1 <- setdiff(allcases, s1)
		s2 <- sample(remainder1, group2size)
		remainder2 <- setdiff(allcases, c(s1, s2))
		s3 <- sample(remainder2, group3size)
		s4 <- setdiff(allcases, c(s1, s2, s3))
		# create a set of 4 trainingsets and testsets
		trainingset <- list(c(s1, s2, s3), c(s2, s3, s4), c(s1, s3, s4), c(s1, s2, s4))
		testset <- list(s4, s1, s2, s3)
		list(trainingset, testset, iteration=1:4)				
	}

	Split4by10 <- function(DV){
		Splits <- list()
		for (i in 1:10){
			Splits[[i]] <- Split4(DV)
		}
	Splits
	}

# ===============================================================
# = exclude iterations with perfect separation in glm (turnout) =
# ===============================================================

# split data

	set.seed(1)
	SplitTur4 <- Split4by10(DV = subset(dataTur, select = "turnout"))
			
# estimate models

	options(warn=1)	
	for (i in 1:10){
		print(paste("interation:", i))
		for (j in 1:4){
			print(j)
			glm(turnout ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + class + interest + strengthpid + civicduty + absdiffeval + satisf, data=dataTur[SplitTur4[[i]][[1]][[j]],], family=binomial(link="probit"))
		}
	}

# note iterations with problems

	all <- 1:4
	IterTur4 <- list()
	IterTur4[[1]] <- all[-c(1)]
	IterTur4[[2]] <- all[-c(1, 2)]
	IterTur4[[3]] <- all[-c(1)]
	IterTur4[[4]] <- all[-c(1)]
	IterTur4[[5]] <- all[-c(3)]
	IterTur4[[6]] <- all[-c(1)]
	IterTur4[[7]] <- all[-c(1)]
	IterTur4[[8]] <- all[-c(4)]
	IterTur4[[9]] <- all[-c(1)]
	IterTur4[[10]] <- all[-c(4)]

# select data without problems

	TurTrainSel4 <- list()
	TurTestSel4 <- list()
	for (i in 1:10){
		TurTrainSel4[[i]] <- SplitTur4[[i]][[1]][IterTur4[i][[1]]]
		TurTestSel4[[i]] <- SplitTur4[[i]][[2]][IterTur4[i][[1]]]
	}

# check whether selection works

	nvalTur4 <- unlist(lapply(IterTur4, length))
	sum(nvalTur4)
		
	options(warn=1)	
	for (i in 1:10){
		print(paste("interation:", i))
		for (j in 1:nvalTur4[i]){
			print(j)
			glm(turnout ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + class + interest + strengthpid + civicduty + absdiffeval + satisf, data=dataTur[TurTrainSel4[[i]][[j]],], family=binomial(link="probit"))
		}
	}	

# ===================================================================
# = exclude iterations with perfect separation in glm (vote choice) =
# ===================================================================

# split data

	set.seed(1)
	SplitPid4 <- Split4by10(DV = subset(dataPid, select = "cdu"))
		
# estimate models

	options(warn=1)	
	for (i in 1:10){
		print(paste("interation:", i))
		for (j in 1:4){
			print(j)
			glm(cdu ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + attendance + worker + selfempl + leftright + spending + immigration + nuclear + partisanship, data=dataPid[SplitPid4[[i]][[1]][[j]],], family=binomial(link="probit"))
		}
	}

# note iterations with problems

	all <- 1:4
	IterPid4 <- list()
	IterPid4[[1]] <- all[-c(1, 3, 4)]
	IterPid4[[2]] <- all[-c(1, 2, 4)]
	IterPid4[[3]] <- all[-c(2, 4)]
	IterPid4[[4]] <- all[-c(1, 2, 3, 4)]
	IterPid4[[5]] <- all[-c(2, 3, 4)]
	IterPid4[[6]] <- all[-c(1, 2)]
	IterPid4[[7]] <- all[-c(2, 4)]
	IterPid4[[8]] <- all[-c(1, 2, 3)]
	IterPid4[[9]] <- all[-c(1, 2, 4)]
	IterPid4[[10]] <- all[-c(1, 2, 3, 4)]

# select data without problems

	PidTrainSel4 <- list()
	PidTestSel4 <- list()
	for (i in 1:10){
		PidTrainSel4[[i]] <- SplitPid4[[i]][[1]][IterPid4[i][[1]]]
		PidTestSel4[[i]] <- SplitPid4[[i]][[2]][IterPid4[i][[1]]]
	}

# check whether selection works

	nvalPid4 <- unlist(lapply(IterPid4, length))
	sum(nvalPid4)
	
	options(warn=1)	
	for (i in c(1:3, 5:8)){
		print(paste("interation:", i))
		for (j in 1:nvalPid4[i]){
			print(j)
			glm(cdu ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + attendance + worker + selfempl + leftright + spending + immigration + nuclear + partisanship, data=dataPid[PidTrainSel4[[i]][[j]],], family=binomial(link="probit"))
		}
	}	

# ====================
# = cross validation =
# ====================

# turnout

	formula <- "turnout ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + class + interest + strengthpid + civicduty + absdiffeval + satisf"
	# count number of sets
	k <- length(unlist(TurTrainSel4, recursive=FALSE))
	# placeholders for AUCs
	AUC.glm <- rep(NA, k)
	AUC.glmdm <- rep(NA, k)
	# placeholders for the phats and actuals
	phats.glm <- list()
	phats.glmdm <- list()
	actuals <- list()
	# loop through sets
	for (i in 1:k){
		trainingdata <- dataTur[unlist(TurTrainSel4, recursive=FALSE)[[i]],]
		testdata <- dataTur[unlist(TurTestSel4, recursive=FALSE)[[i]],]
		# estimate the glm and glmdm model using the trainingset
		model.glm.current <- glm(formula, data = trainingdata, family=binomial(link="probit"))
		model.glmdm.current <- glmdm(formula, data = trainingdata, family = binomial(link = "probit"), num.reps = 5000)
		# make the out-of-sample predictions
		betas.glm <- model.glm.current$coef
		betas.glmdm <- mean.beta(model.glmdm.current)
		phats.glm[[i]] <- pnorm(cbind(1, as.matrix(testdata[,-1])) %*% betas.glm)
		phats.glmdm[[i]] <- pnorm(cbind(1, as.matrix(testdata[,-1])) %*% betas.glmdm)
		# compare these predictions to the actuals,  and calculate the AUC
		actuals[[i]] <- testdata[,1]
		AUC.glm[i] <- somers2(phats.glm[[i]], actuals[[i]])[1]
		AUC.glmdm[i] <- somers2(phats.glmdm[[i]], actuals[[i]])[1]		
	}
	CvalTur <- list(phats.glm=phats.glm, phats.glmdm=phats.glmdm, actuals=actuals, AUC.glm=AUC.glm, AUC.glmdm=AUC.glmdm)

	save(CvalTur, file = paste("Cval4by10Tur", format(Sys.time(), "%d%b%Y"), ".RData", sep = ""))

# vote choice without partisanship

	formula <- "cdu ~ firstgeneration + aussiedler + guestworker + otherlanguage + other + female + age + educ + employ + member + attendance + worker + selfempl + leftright + spending + immigration + nuclear + partisanship"
	# count number of sets
	k <- length(unlist(PidTrainSel4, recursive=FALSE))
	# placeholders for AUCs
	AUC.glm <- rep(NA, k)
	AUC.glmdm <- rep(NA, k)
	# placeholders for the phats and actuals
	phats.glm <- list()
	phats.glmdm <- list()
	actuals <- list()
	# loop through sets
	for (i in 1:k){
		trainingdata <- dataPid[unlist(PidTrainSel4, recursive=FALSE)[[i]],]
		testdata <- dataPid[unlist(PidTestSel4, recursive=FALSE)[[i]],]
		# estimate the glm and glmdm model using the trainingset
		model.glm.current <- glm(formula, data = trainingdata, family=binomial(link="probit"))
		model.glmdm.current <- glmdm(formula, data = trainingdata, family = binomial(link = "probit"), num.reps = 5000)
		# make the out-of-sample predictions
		betas.glm <- model.glm.current$coef
		betas.glmdm <- mean.beta(model.glmdm.current)
		phats.glm[[i]] <- pnorm(cbind(1, as.matrix(testdata[,-1])) %*% betas.glm)
		phats.glmdm[[i]] <- pnorm(cbind(1, as.matrix(testdata[,-1])) %*% betas.glmdm)
		# compare these predictions to the actuals,  and calculate the AUC
		actuals[[i]] <- testdata[,1]
		AUC.glm[i] <- somers2(phats.glm[[i]], actuals[[i]])[1]
		AUC.glmdm[i] <- somers2(phats.glmdm[[i]], actuals[[i]])[1]		
	}
	CvalPid <- list(phats.glm=phats.glm, phats.glmdm=phats.glmdm, actuals=actuals, AUC.glm=AUC.glm, AUC.glmdm=AUC.glmdm)

	save(CvalPid, file = paste("Cval4by10Pid", format(Sys.time(), "%d%b%Y"), ".RData", sep = ""))

# ================
# = save session =
# ================

	save.image(file = paste("Cval4by10Fold", format(Sys.time(), "%d%b%Y"), ".RData", sep = ""))

# ===================
# = end source code =
# ===================	